ALY6010: Final Project



Name : Jeseeka Shah, Sanjana Chaudhari and Supreeth Murugesh
Instructor : Prof. Amin Karimpour
Class : Probability Theory and Introductory Statistics
School : Northeastern University - College of Professional Studies
Date of submission : 12 December, 2021


Dataset

# Loading the dataset 
superstore_2011_2015 <- read.csv("Dataset/superstore-2011-2015.csv")

# Considering only the US market for Analysis 
superstore_2011_2015 <- filter(superstore_2011_2015,  superstore_2011_2015$Market == "US")

# Omitting NA values
superstore_2011_2015 <- na.omit(superstore_2011_2015)

# Calculating Total Sales
superstore_2011_2015 <- mutate(superstore_2011_2015, 
                               Total_Sales = superstore_2011_2015$Sales * superstore_2011_2015$Quantity)

Descriptive Statistics of numerical variables

columnNames <- c('Sales', 'Discount', 'Profit', 'Shipping Cost', 'Total Sales')
rowNames <- c('Mean', 'Standard Deviation', 'Range', 'Median')

# Descriptive statistics of Discount 
meanDiscount <- round(mean(superstore_2011_2015$Discount),2)
stdDiscount <- round(sd(superstore_2011_2015$Discount),4)
rangeDiscount <- round(max(superstore_2011_2015$Discount) - min(superstore_2011_2015$Discount),2)
medianDiscount <- median(superstore_2011_2015$Discount)  

discountStats <- rbind(meanDiscount, stdDiscount, rangeDiscount, medianDiscount)

# Descriptive statistics of Sales
meanSales <- mean(superstore_2011_2015$Sales)
stdSales <- round(sd(superstore_2011_2015$Sales),2)
rangeSales <- max(superstore_2011_2015$Sales) - min(superstore_2011_2015$Sales)
medianSales <- median(superstore_2011_2015$Sales)

salesStats <- rbind(meanSales, stdSales, rangeSales, medianSales)

# Descriptive statistics of Profits
meanProfit <- mean(superstore_2011_2015$Profit)
stdProfit <- round(sd(superstore_2011_2015$Profit),2)
rangeProfit <- max(superstore_2011_2015$Profit) - min(superstore_2011_2015$Profit)
medianProfit <- median(superstore_2011_2015$Profit)

ProfitStats <- rbind(meanProfit, stdProfit, rangeProfit, medianProfit)

# Descriptive statistics of Shipping cost of the product
meanShippingCost <- mean(superstore_2011_2015$Shipping.Cost)
stdShippingCost <- round(sd(superstore_2011_2015$Shipping.Cost),2)
rangeShippingCost <- max(superstore_2011_2015$Shipping.Cost) - min(superstore_2011_2015$Shipping.Cost)
medianShippingCost <- median(superstore_2011_2015$Shipping.Cost)

ShippingCostStats <- rbind(meanShippingCost, stdShippingCost, rangeShippingCost, medianShippingCost)
combinedData <- cbind(discountStats, salesStats, ProfitStats, ShippingCostStats)

tableData <- combinedData
rownames(tableData) <- c('Mean', 'Standard Deviation', 'Range', 'Median')
colnames(tableData) <- c('Discount', 'Sales', 'Profit', 'Shipping Cost')
kable(tableData, caption = "<center><strong>Table 1: Statistics of 4 numerical variables</strong></center>", escape = FALSE,
    format = "html") %>% kable_material(c("striped", "hover"))
Table 1: Statistics of 4 numerical variables
Discount Sales Profit Shipping Cost
Mean 0.1600 229.858 28.6569 23.83168
Standard Deviation 0.2065 623.250 234.2600 58.96000
Range 0.8000 22638.036 14999.9540 933.56000
Median 0.2000 54.490 8.6665 5.10000

Exploratory Data analysis

par(mar=c(5,8,4,2))
# Plot: Orders by each categories
colorname=RColorBrewer::brewer.pal(5,"Dark2") 
y= barplot(table(superstore_2011_2015$Category), 
                 ylim=c(0,8000), 
                 col = colorname, 
                 border="black", 
                 main = "Bar Plot for Category Count", 
                 xlab = "Category", 
                 ylab = "Freqency of Category Count", 
                 col.main="burlywood4", 
                 col.lab="burlywood4") 

par(mar=c(5,8,4,2))
#Plot: Sales for different 
ggplot() + geom_jitter(data = superstore_2011_2015, 
                       aes(x = Region, y = Sales, fill = Region, colour=Region), 
                       stat = "identity") + 
  labs(x = "Region", y = "Sales ($)", title = "Sales Distribution in different regions", subtitle = "Dataset: Superstore Data") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
  theme(plot.title = element_text(size=12, hjust=0.5, colour="black", vjust=-1)) +
  theme(plot.subtitle = element_text(size=9, hjust=0.5, colour="black", vjust=-1))

par(mar=c(5,8,4,2))
#Plot: Sales Vs. Profit with mean and Linear Regression

plot(superstore_2011_2015$Sales, y=superstore_2011_2015$Profit, 
     pch = 19, col = "orange", fit="lm", 
     xlab = "Sales ($)",
     ylab = "Profit ($)",
     main = "Sales vs Distribution")  
abline(h = mean(superstore_2011_2015$Profit), col = 'steelblue', lwd = 3, lty = 2)  
reg_model <- lm(Profit ~ Sales, data = superstore_2011_2015) 
abline(reg_model, col="purple")  

par(mar=c(5,8,4,2))
# Plot: Profit made by catergories
superstore_2011_2015 %>% ggplot(aes(x = Category, y = Profit)) + 
         geom_point(aes(width = 0.9, outlier.shape = NA,color = Category)) +
  labs(title = "Profit distribution in Product categories", 
       subtitle = "Dataset: Superstore Data",
       color = "Category") +
  theme(plot.title = element_text(size=12, hjust=0.5, colour="black", vjust=-1)) +
  theme(plot.subtitle = element_text(size=9, hjust=0.5, colour="black", vjust=-1))
## Warning: Ignoring unknown aesthetics: width, outlier.shape

par(mar=c(5,8,4,2))
# Plot: Shipmode and Shipping cost
ggplot(superstore_2011_2015, 
       aes(x = Ship.Mode, 
           y = Shipping.Cost, 
           color = Ship.Mode)) + 
  geom_jitter(position=position_jitter(0.2), shape=17) +
  theme_minimal() + 
  labs(x="Shipmode", y="Shipping Cost($)", 
       title = "Shipping cost for different shipping modes", 
       subtitle = "Dataset: Superstore Data",
       color = "Shipping Mode") +
  theme(plot.title = element_text(size=12, hjust=0.5, colour="black", vjust=-1)) +
  theme(plot.subtitle = element_text(size=9, hjust=0.5, colour="black", vjust=-1)) + 
  scale_colour_manual(values=c("#AA2A64", "#BF403B", "#274530", "#779645")) +
  scale_shape_manual(values=c("#EDB361", "#BF403B", "#274530", "#779645")) 

# Plot: Order count in each product category
par(mar=c(5,8,4,2))
categoryCount <- table(superstore_2011_2015$Category)
categoryCountPlot <- barplot(categoryCount, 
                              main = "Order count in each product category", 
                              ylab = "Count",
                              xlab = "Category",
                              width = 0.7, space = 0.15, ylim = c(0,7000), 
                              cex.axis = 0.7, las = 1,
                             col = c("#B72E56", "#61B94C", "#7006C2"), border = NA)
text(categoryCountPlot, categoryCount, 
     labels = categoryCount, col = "white", pos = 1, cex= 0.8)

# Plot: Profit distribution
par(mar=c(5,8,4,2))
lossOutlier = which(superstore_2011_2015$Profit < 1.7280 | superstore_2011_2015$Profit > 70.7220)
lossOutlierData <- superstore_2011_2015[-lossOutlier,]
hist(lossOutlierData$Profit,
     breaks = 25,
     ylim = c(0,1000),
     main="Histogram of Profit earned",
     xlab = "Profit ($)",
     ylab = "Count",
     col = terrain.colors(10))

par(mar=c(5,8,4,2))
# Plot: Scatter plot of Total Sales vs Profit vs Discount
plot_ly(x=superstore_2011_2015$Total_Sales, 
        y=superstore_2011_2015$Profit, 
        z=superstore_2011_2015$Discount, 
        type="scatter3d", mode="markers", color=superstore_2011_2015$Sales) %>%
  layout(scene = list(xaxis = list(title = "Sales"), 
                      yaxis = list(title = "Profit"), 
                      zaxis = list(title = "Discount Percentage")))
# Plot 10: Time series analysis of Profit

# Typecasting to date
superstore_2011_2015$OrderDateColumn <- as.Date(superstore_2011_2015$`Order.Date`, "%m/%d/%Y")
superstore_2011_2015$OrderDateColumn <- format(as.Date(superstore_2011_2015$OrderDateColumn), "%Y-%m")

# Group by date and calculate Profit and Loss
timeSeriesProfitData <- aggregate(`Sales` ~ OrderDateColumn + Category, 
                                  data = superstore_2011_2015, FUN = sum, na.rm = TRUE)

ggplot(timeSeriesProfitData, aes(x = factor(OrderDateColumn), y = Sales, group = Category, color = Category)) +  
  geom_line(size = 1) +
  scale_color_manual(values=c("#FDC534", "#007FFE", "#D61E29")) +
  theme_gray() +
  labs(x="Date", y="Profit($)", 
       title = "Profit obtained for different product categories", 
       subtitle = "Dataset: Superstore Data") +
  theme(axis.text.x = element_text(angle = 45, hjust=1, size = 6)) +
  theme(axis.text.y = element_text(hjust=1, size = 14)) +
  theme(plot.title = element_text(size=12, hjust=0.5, colour="black", vjust=-1)) +
  theme(plot.subtitle = element_text(size=9, hjust=0.5, colour="black", vjust=-1))


Hypothesis Testing


1. Hypothesis:

Ho : Average profit of the company in the US region is not $25
Ha : Average profit of the company in the US region is $25

#Ho : Average profit of the company in the US region is not $25 
#Ha : Average profit of the company in the US region is $25 

# t-test (One sample)

oneSampleData <- superstore_2011_2015[sample(nrow(superstore_2011_2015), 29), ]
t.test(oneSampleData$Profit, conf.level = .95, mu = 25)  
## 
##  One Sample t-test
## 
## data:  oneSampleData$Profit
## t = -2.4863, df = 28, p-value = 0.01914
## alternative hypothesis: true mean is not equal to 25
## 95 percent confidence interval:
##  -9.408078 21.677195
## sample estimates:
## mean of x 
##  6.134559

2. Hypothesis:

Ho : True difference in average profit of Office Supplies and Technology is not 0
Ha : True difference in average profit of Office Supplies and Technology is 0

OfficeSuppliesProfit <- subset(superstore_2011_2015, Category =="Office Supplies") 
TechnologyProfit <- subset(superstore_2011_2015, Category =="Technology") 

OfficeSuppliesRandomSample <- OfficeSuppliesProfit[sample(nrow(OfficeSuppliesProfit), 25), ]
TechnologyRandomSample <- TechnologyProfit[sample(nrow(TechnologyProfit), 25), ]

t.test(OfficeSuppliesRandomSample$Profit, TechnologyRandomSample$Profit)
## 
##  Welch Two Sample t-test
## 
## data:  OfficeSuppliesRandomSample$Profit and TechnologyRandomSample$Profit
## t = -2.1034, df = 25.803, p-value = 0.04534
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -395.223587   -4.473461
## sample estimates:
## mean of x mean of y 
##  -15.0338  184.8147

3. Hypothesis:

Ho : True difference in average sales of West and East region is not 0
Ha : True difference in average sales of West and East region is 0

# t-test (Two sample)
westRegionSales <- subset(superstore_2011_2015, Region =="West") 
eastRegionSales <- subset(superstore_2011_2015, Region =="East") 

westRandomSample <- westRegionSales[sample(nrow(westRegionSales), 20), ]
eastRandomSample <- eastRegionSales[sample(nrow(eastRegionSales), 20), ]

t.test(westRandomSample$Sales, eastRandomSample$Sales, conf.level = 0.95) 
## 
##  Welch Two Sample t-test
## 
## data:  westRandomSample$Sales and eastRandomSample$Sales
## t = -0.31458, df = 37.605, p-value = 0.7548
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -304.9933  222.9776
## sample estimates:
## mean of x mean of y 
##  210.8363  251.8442

Data distribution of Shipping cost for Corporate vs Consumer segments

consumerShippingCost <- subset(superstore_2011_2015, Segment =="Consumer") 
corporateShippingCost <- subset(superstore_2011_2015, Segment =="Corporate") 


plotData <- subset(superstore_2011_2015, Segment =="Corporate" | Segment =="Consumer") 
ggplot(plotData, aes(x = plotData$Segment, y = plotData$Shipping.Cost, 
                                 color=plotData$Segment)) + 
  geom_jitter(position=position_jitter(0.2)) +
  theme_light() + 
  labs(y="Shipping costs ($)", x="Segments", title = "Shipping cost for Corporate vs Consumer segments", 
       subtitle = "Dataset: Superstore data") +
  theme(plot.title = element_text(size=14, hjust=0.5, colour="black", vjust=-1)) +
  theme(plot.subtitle = element_text(size=10, hjust=0.5, colour="black", vjust=-1)) + 
   scale_colour_manual(values = c("#4AB097", "#BB203C", "#FB761F")) +
  scale_shape_manual(values=c(1,17,19))


4. Hypothesis:

Ho : True difference in average shipping cost of Consumer and Corporate segments is not 0
Ha : True difference in average shipping cost of Consumer and Corporate segments is 0

# t-test (Two sample)
consumerRandomSample <- consumerShippingCost[sample(nrow(consumerShippingCost), 24), ]
corporateRandomSample <- corporateShippingCost[sample(nrow(corporateShippingCost), 24), ]

t.test(consumerRandomSample$Shipping.Cost, corporateRandomSample$Shipping.Cost, conf.level = 0.95) 
## 
##  Welch Two Sample t-test
## 
## data:  consumerRandomSample$Shipping.Cost and corporateRandomSample$Shipping.Cost
## t = 1.9364, df = 24.72, p-value = 0.06434
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.946117 94.686117
## sample estimates:
## mean of x mean of y 
##  59.25375  13.38375

5. Hypothesis:

Ho : Company’s average sales is higher in the West region than East region of the US
Ha : Company’s average sales is higher in the East region than West region of the US

# t-test (Two sample)
t.test(westRandomSample$Sales, eastRandomSample$Sales, conf.level = 0.95,
       alternative = "greater") 
## 
##  Welch Two Sample t-test
## 
## data:  westRandomSample$Sales and eastRandomSample$Sales
## t = -0.31458, df = 37.605, p-value = 0.6226
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -260.8419       Inf
## sample estimates:
## mean of x mean of y 
##  210.8363  251.8442

Correlation and Regression analysis

# Correlation between Sale, Discount, Profit, Quantity and Shipping cost

corrData <- data.frame(Sales = superstore_2011_2015$Sales,
                       Discount = superstore_2011_2015$Discount,
                       Profit = superstore_2011_2015$Profit,
                       Quantity = superstore_2011_2015$Quantity,
                       `Shipping Cost` = superstore_2011_2015$Shipping.Cost)
ggcorrplot(cor(corrData), 
           hc.order = TRUE, 
           type = "lower", lab = TRUE, 
           insig = "blank",
           ggtheme = ggplot2::theme_grey(),
           title = "Correlation between Company's Sales stats")

# Linear Regression Model
model2 <- lm(Shipping.Cost ~ Sales + Category, data = superstore_2011_2015)
summary(model2)
## 
## Call:
## lm(formula = Shipping.Cost ~ Sales + Category, data = superstore_2011_2015)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1398.26   -10.55    -4.82    -2.02   773.25 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              16.053203   0.966937  16.602   <2e-16 ***
## Sales                     0.062090   0.000709  87.579   <2e-16 ***
## CategoryOffice Supplies -11.032639   1.098905 -10.040   <2e-16 ***
## CategoryTechnology        0.858914   1.371787   0.626    0.531    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.04 on 9990 degrees of freedom
## Multiple R-squared:  0.4673, Adjusted R-squared:  0.4671 
## F-statistic:  2921 on 3 and 9990 DF,  p-value: < 2.2e-16
# PLot1

p1 <- ggplot(superstore_2011_2015, aes(x = Sales, 
                             y = Shipping.Cost,
                             col = Profit)) + 
  xlim(c(0,7500)) + ylim(c(0,1000)) +
  geom_point() +
  theme_minimal() +
  theme(axis.text=element_text(size=12)) +
  geom_smooth(method="lm", formula = y~x, color = "red") +
  facet_grid(~Segment)
p1 + scale_color_gradient(low="pink", high="blue")
## Warning: Removed 13 rows containing non-finite values (stat_smooth).
## Warning: Removed 13 rows containing missing values (geom_point).